Goal: Compare super claims 1, 3, and 5. Claim 1: Not Happening; Claim 3: Climate Impacts Not Bad; Claim 5: Science/Scientists Not Reliable.

knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/dowloads/etc")
library(forcats) # 
library(tidytext) # allows us to tokenize data 
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds
library(fmsb)
library(plotly)
library(ggthemes)
library(tm)

Super Claim #1 Not Happening

# Read the labelled training data (23,436 rows; two character columns:
# text, claim). show_col_types = FALSE silences the column-spec message,
# exactly as readr's own startup message suggests.
nature_analysis <- read_csv(here("data/training.csv"), show_col_types = FALSE)
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Filter() to select super claim 1

# Keep only the rows whose claim label belongs to super claim 1 ("1_*").
# "1_" contains no regex metacharacters, so fixed matching is equivalent.
na_1 <- filter(nature_analysis, grepl("1_", claim, fixed = TRUE))

Add word_count column using mutate()

# Count whitespace-separated tokens per document. Referencing the bare
# column (`text`) instead of `na_1$text` is the correct mutate() idiom:
# `na_1$text` bypasses the data mask and silently breaks if the frame is
# ever grouped or re-filtered upstream.
na_1 <- na_1 %>%
  mutate(word_count = str_count(text, "\\S+"))

#Distribution visual, geom_histogram

# Histogram of document lengths for super claim 1, filled by sub-claim.
ggplot(na_1, aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 1") +
  theme_wsj() +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens() to separate text into words

# One row per word, then tally frequencies in descending order.
# count(sort = TRUE) is equivalent to count() %>% arrange(desc(n)).
na_1_tokenized <- na_1 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)

Filter() out stopwords()

# Drop English stopwords from the claim-1 frequency table.
na_1_tokenized <- filter(na_1_tokenized, !(words %in% stopwords("english")))

#Word Cloud visual

# Word cloud of the 200 most frequent claim-1 words (min frequency 5),
# laid out by frequency rather than randomly.
wordcloud(
  words = na_1_tokenized$words,
  freq = na_1_tokenized$n,
  max.words = 200,
  min.freq = 5,
  random.order = FALSE,
  colors = brewer.pal(12, "Paired")
)
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : predictions could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : alarmists could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : minimum could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : recorded could not be fit on page. It will not be plotted.

# Co-occurrence network for claim 1: tokenize, lowercase, strip
# punctuation and stopwords, then plot the 30 strongest features from a
# windowed feature co-occurrence matrix.
na_1_corpus <- corpus(na_1$text)

toks <- tokens_remove(
  tokens_tolower(tokens(na_1_corpus, remove_punct = TRUE)),
  pattern = stopwords("english"), padding = FALSE
)

fcmat <- fcm(toks, context = "window", tri = FALSE)

# Restrict the matrix to its 30 most connected features before plotting.
feat <- names(topfeatures(fcmat, 30))

textplot_network(fcm_select(fcmat, pattern = feat), min_freq = 0.5)

Super Claim 3: Climate Impacts Not Bad. Filter() for super claim 3

# Keep only the rows whose claim label belongs to super claim 3 ("3_*").
# "3_" contains no regex metacharacters, so fixed matching is equivalent.
na_3 <- filter(nature_analysis, grepl("3_", claim, fixed = TRUE))

Add word_count column using mutate()

# Count whitespace-separated tokens per document. Referencing the bare
# column (`text`) instead of `na_3$text` is the correct mutate() idiom:
# `na_3$text` bypasses the data mask and silently breaks if the frame is
# ever grouped or re-filtered upstream.
na_3 <- na_3 %>%
  mutate(word_count = str_count(text, "\\S+"))

#Distribution visual, geom_histogram

# Histogram of document lengths for super claim 3, filled by sub-claim.
ggplot(na_3, aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 3") +
  theme_wsj() +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens()

# One row per word, then tally frequencies in descending order.
# count(sort = TRUE) is equivalent to count() %>% arrange(desc(n)).
na_3_tokenized <- na_3 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)

Filter() out stopwords()

# Remove stopwords plus citation artifacts ("et", "al") and the bare
# numeral "2" that survive tokenization of scientific text.
# NOTE(review): this section removes tidytext's larger stop_words
# lexicon, while the claim-1 and claim-5 sections use the smaller
# stopwords("english") list — confirm the inconsistency is intentional,
# since it makes the three word clouds not directly comparable.
na_3_tokenized <- na_3_tokenized %>%
  anti_join(stop_words, by = c("words" = "word")) %>%
  filter(!words %in% c("et", "al", "2"))

#Word cloud visual

# Word cloud of the 200 most frequent claim-3 words (min frequency 5),
# with deterministic layout and colour assignment.
wordcloud(
  words = na_3_tokenized$words,
  freq = na_3_tokenized$n,
  max.words = 200,
  min.freq = 5,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(12, "Paired")
)

# Co-occurrence network for claim 3: tokenize, lowercase, strip
# punctuation and stopwords, then plot the 30 strongest features from a
# windowed feature co-occurrence matrix.
na_3_corpus <- corpus(na_3$text)

toks <- tokens_remove(
  tokens_tolower(tokens(na_3_corpus, remove_punct = TRUE)),
  pattern = stopwords("english"), padding = FALSE
)

fcmat <- fcm(toks, context = "window", tri = FALSE)

# Restrict the matrix to its 30 most connected features before plotting.
feat <- names(topfeatures(fcmat, 30))

textplot_network(fcm_select(fcmat, pattern = feat), min_freq = 0.5)

Super Claim 5: Science/Scientists Not Reliable. Filter() for super claim 5

# Keep only the rows whose claim label belongs to super claim 5 ("5_*").
# "5_" contains no regex metacharacters, so fixed matching is equivalent.
na_5 <- filter(nature_analysis, grepl("5_", claim, fixed = TRUE))

Add word_count column using mutate()

# Count whitespace-separated tokens per document. Referencing the bare
# column (`text`) instead of `na_5$text` is the correct mutate() idiom:
# `na_5$text` bypasses the data mask and silently breaks if the frame is
# ever grouped or re-filtered upstream.
na_5 <- na_5 %>%
  mutate(word_count = str_count(text, "\\S+"))

#Distribution visual, geom_histogram

# Histogram of document lengths for super claim 5, filled by sub-claim.
ggplot(na_5, aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  labs(title = "Distribution of Claims",
       subtitle = "Claim 5") +
  theme_wsj() +
  theme(text = element_text(family = "Menlo-Bold", size = 12),
        legend.title = element_text(family = "Menlo-Bold", size = 12))

Tokenize using unnest_tokens()

# BUG FIX: tokenize the claim-5 subset (na_5), not the full
# nature_analysis data set — the original tokenized every claim, so the
# "claim 5" word cloud below actually showed the whole corpus.
na_5_tokenzied <- na_5 %>%
  unnest_tokens(words, text)

# Tally word frequencies, most frequent first. (The misspelled
# "tokenzied" name is kept because later code refers to it.)
na_5_tokenzied <- na_5_tokenzied %>%
  count(words) %>%
  arrange(desc(n))

Filter() out stopwords()

# Remove English stopwords from the claim-5 frequency table.
na_5_tokenzied <- na_5_tokenzied %>%
  filter(!words %in% stopwords("english"))

# Word cloud of the 200 most frequent claim-5 words (min frequency 5).
# Fixed: spell out `colors =` — the original `color =` only worked via
# R's partial argument matching.
wordcloud(na_5_tokenzied$words, freq = na_5_tokenzied$n, max.words = 200,
          min.freq = 5, random.order = FALSE, random.color = FALSE,
          colors = brewer.pal(12, "Paired"))

# Co-occurrence network for claim 5: tokenize, lowercase, strip
# punctuation and stopwords, then plot the 30 strongest features from a
# windowed feature co-occurrence matrix.
na_5_corpus <- corpus(na_5$text)

toks <- tokens_remove(
  tokens_tolower(tokens(na_5_corpus, remove_punct = TRUE)),
  pattern = stopwords("english"), padding = FALSE
)

fcmat <- fcm(toks, context = "window", tri = FALSE)

# Restrict the matrix to its 30 most connected features before plotting.
feat <- names(topfeatures(fcmat, 30))

textplot_network(fcm_select(fcmat, pattern = feat), min_freq = 0.5)

# Build one pseudo-document per super claim and contrast their term
# usage in a comparison cloud.
# (as.matrix() dispatches to the data-frame method; no need to call
# as.matrix.data.frame() directly.)
na_1_matrix <- as.matrix(na_1_tokenized)
na_3_matrix <- as.matrix(na_3_tokenized)
na_5_matrix <- as.matrix(na_5_tokenzied)

na_1_text <- apply(na_1_matrix, 1, toString)
na_3_text <- apply(na_3_matrix, 1, toString)
na_5_text <- apply(na_5_matrix, 1, toString)
if (require(tm)) {
  # BUG FIX: the original passed c("na_1_text", "na_3_text", "na_5_text")
  # — the literal variable *names* — so the cloud compared three
  # three-word strings, not the data. Instead, build one document per
  # claim, repeating each word by its count so term frequencies carry
  # through to the cloud.
  texts <- c(
    paste(rep(na_1_tokenized$words, na_1_tokenized$n), collapse = " "),
    paste(rep(na_3_tokenized$words, na_3_tokenized$n), collapse = " "),
    paste(rep(na_5_tokenzied$words, na_5_tokenzied$n), collapse = " ")
  )

  # Create a corpus from the text data
  corp <- Corpus(VectorSource(texts))

  # comparison.cloud() expects terms in rows and documents in columns,
  # i.e. a TermDocumentMatrix (a DocumentTermMatrix is the transpose,
  # which made the original colnames() assignment label terms, not
  # documents).
  term.matrix <- TermDocumentMatrix(corp)
  term.matrix <- as.matrix(term.matrix)

  # Label the three document columns by claim.
  colnames(term.matrix) <- c("Claim 1", "Claim 3", "Claim 5")

  # Generate the word cloud (three colour/background variants); supply
  # three title colours to match the three documents.
  comparison.cloud(term.matrix, max.words = 40, random.order = FALSE)
  comparison.cloud(term.matrix, max.words = 40, random.order = FALSE,
                   title.colors = c("red", "blue", "darkgreen"),
                   title.bg.colors = c("grey40", "grey70", "grey90"))
  comparison.cloud(term.matrix, max.words = 40, random.order = FALSE,
                   match.colors = TRUE)
}